---
title: "Food Inspections Group Final"
author: "Emma Dempsey, Jerry Adams Franklin, Audrey Lamont, and Ty Mulholland"
date: "April 13, 2022"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

#to load needed packages:
library(ggplot2)
library(tidyr)
library(tidyverse)
library("data.table")
library(ggthemes)
library(dplyr)
library(dataverse)
library(sqldf)
library(sf)
library(tigris)
library(tidycensus)
library(ggmap)
library(leaflet)
library(tmap)
library(tmaptools)
library(shinyjs)
library(tm)
library(lubridate)
```


```{r, message=FALSE}
options(scipen = 100)

#to load datasets from the dataverse:
Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
yelp<- get_dataframe_by_name("yelp2.tab","10.7910/DVN/D5N473")
FIR <- get_dataframe_by_name("Food.Inspections.Records (1).tab","10.7910/DVN/WZK3VM" )

```


```{r}
#to create total inspection number and passing rate:
yelp <-mutate(yelp, Insp_Total = Inspection.Fail.Count + Inspection.Pass.Count, 
             Tract_Avg_Pass_Rate = Inspection.Pass.Count/Insp_Total, 
             Tract_Avg_Fail_Rate = Inspection.Fail.Count/Insp_Total)


#to calculate the average Yelp rating and number of reviews by tract for 
#all restaurants with Yelp data:
Reviews <-sqldf("select CT_ID_10,
        avg(ave_rating),
        avg(review_count_total)
        from yelp
        group by CT_ID_10")

#to calculate the average pass rate and Level 1 violations by tract for 
#all restaurants with Inspections data:
colnames(yelp)[11] <- "Violation_1"
Inspections <-sqldf("select CT_ID_10,
        avg(Tract_Avg_Pass_Rate),
        avg(Violation_1)
        from yelp
        group by CT_ID_10")

# to merge aggregated datasets together:
corrected <- left_join(Reviews, Inspections, by = "CT_ID_10")

#to change dates into date format, calculate difference, and time between:
FIRDate <-mutate(FIR, StartDate = mdy_hm(issdttm), EndDate = mdy_hm(expdttm), 
                 TimeDiff =time_length(difftime(EndDate, StartDate), "years"), 
                 TimeDiffAlt = time_length(difftime(StartDate, EndDate), "years"),
                 Trange = interval(StartDate, EndDate))

#to select the correct difference:
FIRDate <-mutate(FIRDate, TimeDiffCorrect = 
                     ifelse(TimeDiff > TimeDiffAlt, TimeDiff, TimeDiffAlt))

#to aggregate by Census Tract:
Time <- FIRDate %>%
    group_by(CT_ID_10) %>%
    summarise(AvgYears = mean(TimeDiffCorrect))

#to merge with the updated dataset:
latent <- left_join(corrected, Time, by = "CT_ID_10")

#to rename columns:
colnames(latent) <-c("CT_ID_10","Tract_Avg_Rating","Tract_Avg_Review_Count","Tract_Avg_Pass","Tract_Avg_Level_1","Operating_Years")

#to create z-scores:
standardized <- mutate(latent, NRating = (Tract_Avg_Rating-mean(Tract_Avg_Rating, na.rm=T))/sd(Tract_Avg_Rating, na.rm=T),
            NReview = (Tract_Avg_Review_Count-mean(Tract_Avg_Review_Count, na.rm=T))/sd(Tract_Avg_Review_Count, na.rm=T),
            NPass = (Tract_Avg_Pass-mean(Tract_Avg_Pass, na.rm=T))/sd(Tract_Avg_Pass, na.rm=T),
            NLevel1 = (Tract_Avg_Level_1-mean(Tract_Avg_Level_1, na.rm=T))/sd(Tract_Avg_Level_1, na.rm=T),
            NOperate = (Operating_Years-mean(Operating_Years, na.rm=T))/sd(Operating_Years, na.rm=T))


#to create average across these variables:
standardized$AllAvg <-rowMeans(standardized[, c(7:11)])

# to finalize the latent construct by calculating probability and multiplying by 100 for the score:
standardized$stability <- pnorm(standardized$AllAvg, mean = 0, sd = 1)*100

#summary of new latent construct for stability:
summary(standardized$stability)



#to merge with the yelp dataset: 
yelp2<- left_join(yelp, standardized, by = "CT_ID_10")

yelp2 <- yelp2 %>% 
  rename_at(
    vars(ends_with(".x")),
    ~str_replace(., "\\..$","")
  ) %>% 
  select_at(
    vars(-ends_with(".y"))
  )

```

```{r}
# to create consumer satisfaction latent construct:
geographic<-sqldf("select Land_Parcel_ID, sum(ave_rating) as average_rating, count(Land_Parcel_ID) as location, sum(review_count_anuave)/count(review_count_anuave) as average_annual_review_count, sum(ave_rating)/count(ave_rating) as location_average_rating from yelp2 group by Land_Parcel_ID")

#to create dataset with only land parcel ID and satisfaction score:
 geographic2 <- geographic %>%
   select(Land_Parcel_ID, location_average_rating)


 #to merge with dataset containing stability measure:
 final_yelp_data <- left_join(yelp2, geographic2, by = "Land_Parcel_ID")

```


```{r warning=FALSE, message=FALSE}


#to create desirability latent construct:
final_yelp_data$desirability <- as.factor(ifelse((final_yelp_data$ave_rating > 3.4 & 
                                      final_yelp_data$review_count_total >  50 &
                                      ((final_yelp_data$Fail_Rate_Corrected + final_yelp_data$Fail_Rate)/2) < .7)
                                     
                                   , "Desirable", "Not Desirable"))


```

```{r warning=FALSE, message=FALSE}
#to create latent construct of owner attitude toward violations:
final_yelp_data$attitude <- as.factor(ifelse((final_yelp_data$ave_rating > 3.4 & 
                                    final_yelp_data$Violation_1 < 44 &
                                    final_yelp_data$Violation.Level2.Count < 5 &
                                    final_yelp_data$Violation.Level3.Count < 10 &
                                    final_yelp_data$Violation.2018.Count <= ((final_yelp_data$Violation.2016.Count
                                                                     + final_yelp_data$Violation.2017.Count)/2) &
                                   final_yelp_data$Violation.2019.Count <= ((final_yelp_data$Violation.2016.Count
                                                                    + final_yelp_data$Violation.2017.Count
                                                                    + final_yelp_data$Violation.2018.Count)/3) &
                                   final_yelp_data$Violation.2020.Count <= ((final_yelp_data$Violation.2016.Count
                                                                    + final_yelp_data$Violation.2017.Count
                                                                    + final_yelp_data$Violation.2018.Count
                                                                    + final_yelp_data$Violation.2019.Count)/4))
                                   , "Positive", "Negetive"))

#to reposition columns:
final_yelp_data %>% relocate(attitude, .after = desirability)
```